# Flights that arrived two or more hours (120 minutes) late.
flights %>%
  filter(arr_delay >= 120)
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 811 630 101. 1047
## 2 2013 1 1 848 1835 853. 1001
## 3 2013 1 1 957 733 144. 1056
## 4 2013 1 1 1114 900 134. 1447
## 5 2013 1 1 1505 1310 115. 1638
## 6 2013 1 1 1525 1340 105. 1831
## 7 2013 1 1 1549 1445 64. 1912
## 8 2013 1 1 1558 1359 119. 1718
## 9 2013 1 1 1732 1630 62. 2028
## 10 2013 1 1 1803 1620 103. 2008
## # ... with 10,190 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights whose destination is IAH or HOU; %in% tests membership in a set
# without chaining == comparisons with |.
filter(flights, dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 533 529 4. 850
## 3 2013 1 1 623 627 -4. 933
## 4 2013 1 1 728 732 -4. 1041
## 5 2013 1 1 739 739 0. 1104
## 6 2013 1 1 908 908 0. 1228
## 7 2013 1 1 1028 1026 2. 1350
## 8 2013 1 1 1044 1045 -1. 1352
## 9 2013 1 1 1114 900 134. 1447
## 10 2013 1 1 1205 1200 5. 1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights operated by carrier UA, AA or DL; %in% tests membership in a set
# without chaining == comparisons with |.
filter(flights, carrier %in% c("UA", "AA", "DL"))
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 533 529 4. 850
## 3 2013 1 1 542 540 2. 923
## 4 2013 1 1 554 600 -6. 812
## 5 2013 1 1 554 558 -4. 740
## 6 2013 1 1 558 600 -2. 753
## 7 2013 1 1 558 600 -2. 924
## 8 2013 1 1 558 600 -2. 923
## 9 2013 1 1 559 600 -1. 941
## 10 2013 1 1 559 600 -1. 854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights whose month is 7, 8 or 9.
flights %>%
  filter(month >= 7 & month <= 9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212. 236
## 2 2013 7 1 2 2359 3. 344
## 3 2013 7 1 29 2245 104. 151
## 4 2013 7 1 43 2130 193. 322
## 5 2013 7 1 44 2150 174. 300
## 6 2013 7 1 46 2051 235. 304
## 7 2013 7 1 48 2001 287. 308
## 8 2013 7 1 58 2155 183. 335
## 9 2013 7 1 100 2146 194. 327
## 10 2013 7 1 100 2245 135. 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights that arrived at least 120 minutes late although their departure
# delay was zero or negative.
flights %>%
  filter(arr_delay >= 120 & dep_delay <= 0)
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 27 1419 1420 -1. 1754
## 2 2013 10 7 1350 1350 0. 1736
## 3 2013 10 7 1357 1359 -2. 1858
## 4 2013 10 16 657 700 -3. 1258
## 5 2013 11 1 658 700 -2. 1329
## 6 2013 3 18 1844 1847 -3. 39
## 7 2013 4 17 1635 1640 -5. 2049
## 8 2013 4 18 558 600 -2. 1149
## 9 2013 4 18 655 700 -5. 1213
## 10 2013 5 22 1827 1830 -3. 2217
## # ... with 19 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights with a departure delay of at least 60 whose arrival delay was at
# least 30 smaller than the departure delay.
flights %>%
  filter(dep_delay >= 60 & dep_delay - arr_delay >= 30)
## # A tibble: 2,074 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 1716 1545 91. 2140
## 2 2013 1 1 2205 1720 285. 46
## 3 2013 1 1 2326 2130 116. 131
## 4 2013 1 3 1503 1221 162. 1803
## 5 2013 1 3 1821 1530 171. 2131
## 6 2013 1 3 1839 1700 99. 2056
## 7 2013 1 3 1850 1745 65. 2148
## 8 2013 1 3 1923 1815 68. 2036
## 9 2013 1 3 1941 1759 102. 2246
## 10 2013 1 3 1950 1845 65. 2228
## # ... with 2,064 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Keeps flights whose dep_time lies from 0 through 600.
# NOTE(review): elsewhere in this document midnight is described as being
# recorded as 2400, so midnight departures would not be matched by this
# filter — confirm whether they should be included.
filter(flights, dep_time >= 0, dep_time <= 600)
## # A tibble: 9,344 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 533 529 4. 850
## 3 2013 1 1 542 540 2. 923
## 4 2013 1 1 544 545 -1. 1004
## 5 2013 1 1 554 600 -6. 812
## 6 2013 1 1 554 558 -4. 740
## 7 2013 1 1 555 600 -5. 913
## 8 2013 1 1 557 600 -3. 709
## 9 2013 1 1 557 600 -3. 838
## 10 2013 1 1 558 600 -2. 753
## # ... with 9,334 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
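between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?
between() checks whether each value in a numeric vector falls within a specified range; both endpoints are inclusive, so between(x, left, right) is a shortcut for x >= left & x <= right.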
Using between() in previous challenges: # 4)
# Same month filter as before, written with between() (inclusive bounds).
flights %>%
  filter(between(month, 7, 9))
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212. 236
## 2 2013 7 1 2 2359 3. 344
## 3 2013 7 1 29 2245 104. 151
## 4 2013 7 1 43 2130 193. 322
## 5 2013 7 1 44 2150 174. 300
## 6 2013 7 1 46 2051 235. 304
## 7 2013 7 1 48 2001 287. 308
## 8 2013 7 1 58 2155 183. 335
## 9 2013 7 1 100 2146 194. 327
## 10 2013 7 1 100 2245 135. 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Same dep_time filter as before, written with between() (inclusive bounds).
flights %>%
  filter(between(dep_time, 0, 600))
## # A tibble: 9,344 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 533 529 4. 850
## 3 2013 1 1 542 540 2. 923
## 4 2013 1 1 544 545 -1. 1004
## 5 2013 1 1 554 600 -6. 812
## 6 2013 1 1 554 558 -4. 740
## 7 2013 1 1 555 600 -5. 913
## 8 2013 1 1 557 600 -3. 709
## 9 2013 1 1 557 600 -3. 838
## 10 2013 1 1 558 600 -2. 753
## # ... with 9,334 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
dep_time? What other variables are missing? What might these rows represent?
filter(flights, is.na(dep_time))
## # A tibble: 8,255 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 NA 1630 NA NA
## 2 2013 1 1 NA 1935 NA NA
## 3 2013 1 1 NA 1500 NA NA
## 4 2013 1 1 NA 600 NA NA
## 5 2013 1 2 NA 1540 NA NA
## 6 2013 1 2 NA 1620 NA NA
## 7 2013 1 2 NA 1355 NA NA
## 8 2013 1 2 NA 1420 NA NA
## 9 2013 1 2 NA 1321 NA NA
## 10 2013 1 2 NA 1545 NA NA
## # ... with 8,245 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
8,255 flights have a missing dep_time. These flights are also missing dep_delay, arr_time, and arr_delay, suggesting that these flights did not fly anywhere and thus likely represent canceled flights.
NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!)
NA ^ 0
## [1] 1
# OR with TRUE yields TRUE whatever the other operand is.
NA | TRUE
## [1] TRUE
# AND with FALSE yields FALSE whatever the other operand is.
FALSE & NA
## [1] FALSE
# Multiplying by zero still propagates the missing value.
NA * 0
## [1] NA
NA ^ 0 is not missing because any value to the 0th power equals 1. NA | TRUE is not missing because the second condition is TRUE, thus the result is TRUE (or logic - doesn’t matter what the other condition is). FALSE & NA is not missing because the first condition is FALSE, thus the result is FALSE (and logic - FALSE and anything will always be FALSE). However, NA * 0 is NA.
The general rule is that if the value of the missing value could potentially change the outcome of the operation, the result is missing (for example, with NA * 0, for most values of the missing value, the result would be 0; however, if the missing value was Inf, then the result would be NaN).
arrange() to sort all missing values to the start? (Hint: use is.na()).
arrange(flights, desc(is.na(dep_time)))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 NA 1630 NA NA
## 2 2013 1 1 NA 1935 NA NA
## 3 2013 1 1 NA 1500 NA NA
## 4 2013 1 1 NA 600 NA NA
## 5 2013 1 2 NA 1540 NA NA
## 6 2013 1 2 NA 1620 NA NA
## 7 2013 1 2 NA 1355 NA NA
## 8 2013 1 2 NA 1420 NA NA
## 9 2013 1 2 NA 1321 NA NA
## 10 2013 1 2 NA 1545 NA NA
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights to find the most delayed flights. Find the flights that left earliest.
Most delayed flights:
# Largest departure delay first.
flights %>%
  arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301. 1242
## 2 2013 6 15 1432 1935 1137. 1607
## 3 2013 1 10 1121 1635 1126. 1239
## 4 2013 9 20 1139 1845 1014. 1457
## 5 2013 7 22 845 1600 1005. 1044
## 6 2013 4 10 1100 1900 960. 1342
## 7 2013 3 17 2321 810 911. 135
## 8 2013 6 27 959 1900 899. 1236
## 9 2013 7 22 2257 759 898. 121
## 10 2013 12 5 756 1700 896. 1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Flights that left earliest:
# Smallest (most negative) departure delay first.
flights %>%
  arrange(dep_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 7 2040 2123 -43. 40
## 2 2013 2 3 2022 2055 -33. 2240
## 3 2013 11 10 1408 1440 -32. 1549
## 4 2013 1 11 1900 1930 -30. 2233
## 5 2013 1 29 1703 1730 -27. 1947
## 6 2013 8 9 729 755 -26. 1002
## 7 2013 10 23 1907 1932 -25. 2143
## 8 2013 3 30 2030 2055 -25. 2213
## 9 2013 3 2 1431 1455 -24. 1601
## 10 2013 5 5 934 958 -24. 1225
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights to find the fastest flights.
arrange(flights, desc(distance / air_time))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 5 25 1709 1700 9. 1923
## 2 2013 7 2 1558 1513 45. 1745
## 3 2013 5 13 2040 2025 15. 2225
## 4 2013 3 23 1914 1910 4. 2045
## 5 2013 1 12 1559 1600 -1. 1849
## 6 2013 11 17 650 655 -5. 1059
## 7 2013 2 21 2355 2358 -3. 412
## 8 2013 11 17 759 800 -1. 1212
## 9 2013 11 16 2003 1925 38. 17
## 10 2013 11 16 2349 2359 -10. 402
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Traveled longest:
# Longest distance first.
flights %>%
  arrange(desc(distance))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 857 900 -3. 1516
## 2 2013 1 2 909 900 9. 1525
## 3 2013 1 3 914 900 14. 1504
## 4 2013 1 4 900 900 0. 1516
## 5 2013 1 5 858 900 -2. 1519
## 6 2013 1 6 1019 900 79. 1558
## 7 2013 1 7 1042 900 102. 1620
## 8 2013 1 8 901 900 1. 1504
## 9 2013 1 9 641 900 1301. 1242
## 10 2013 1 10 859 900 -1. 1449
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Traveled shortest:
# Shortest distance first.
flights %>%
  arrange(distance)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 27 NA 106 NA NA
## 2 2013 1 3 2127 2129 -2. 2222
## 3 2013 1 4 1240 1200 40. 1333
## 4 2013 1 4 1829 1615 134. 1937
## 5 2013 1 4 2128 2129 -1. 2218
## 6 2013 1 5 1155 1200 -5. 1241
## 7 2013 1 6 2125 2129 -4. 2224
## 8 2013 1 7 2124 2129 -5. 2212
## 9 2013 1 8 2127 2130 -3. 2304
## 10 2013 1 9 2126 2129 -3. 2217
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
dep_time, dep_delay, arr_time, and arr_delay from flights.
select(flights, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2. 830 11.
## 2 533 4. 850 20.
## 3 542 2. 923 33.
## 4 544 -1. 1004 -18.
## 5 554 -6. 812 -25.
## 6 554 -4. 740 12.
## 7 555 -5. 913 19.
## 8 557 -3. 709 -14.
## 9 557 -3. 838 -8.
## 10 558 -2. 753 8.
## # ... with 336,766 more rows
# Pick the columns by name prefix: everything starting with "dep" or "arr".
flights %>%
  select(starts_with("dep"), starts_with("arr"))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2. 830 11.
## 2 533 4. 850 20.
## 3 542 2. 923 33.
## 4 544 -1. 1004 -18.
## 5 554 -6. 812 -25.
## 6 554 -4. 740 12.
## 7 555 -5. 913 19.
## 8 557 -3. 709 -14.
## 9 557 -3. 838 -8.
## 10 558 -2. 753 8.
## # ... with 336,766 more rows
# Pick the columns by position: 4, 6, 7 and 9 are dep_time, dep_delay,
# arr_time and arr_delay in the flights column order.
flights %>%
  select(4, 6, 7, 9)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2. 830 11.
## 2 533 4. 850 20.
## 3 542 2. 923 33.
## 4 544 -1. 1004 -18.
## 5 554 -6. 812 -25.
## 6 554 -4. 740 12.
## 7 555 -5. 913 19.
## 8 557 -3. 709 -14.
## 9 557 -3. 838 -8.
## 10 558 -2. 753 8.
## # ... with 336,766 more rows
select() call?
select(flights, air_time, air_time)
## # A tibble: 336,776 x 1
## air_time
## <dbl>
## 1 227.
## 2 227.
## 3 160.
## 4 183.
## 5 116.
## 6 150.
## 7 158.
## 8 53.
## 9 140.
## 10 138.
## # ... with 336,766 more rows
Even if you include the name of a variable multiple times in a select() call, the variable is only included once in the data frame.
one_of() function do? Why might it be helpful in conjunction with this vector?
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
The one_of() function enables you to select variables matching elements in a character vector.
For example, you could select the variables in vars from flights:
# Select the columns whose names are listed in the character vector vars.
flights %>%
  select(one_of(vars))
## # A tibble: 336,776 x 5
## year month day dep_delay arr_delay
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 2. 11.
## 2 2013 1 1 4. 20.
## 3 2013 1 1 2. 33.
## 4 2013 1 1 -1. -18.
## 5 2013 1 1 -6. -25.
## 6 2013 1 1 -4. 12.
## 7 2013 1 1 -5. 19.
## 8 2013 1 1 -3. -14.
## 9 2013 1 1 -3. -8.
## 10 2013 1 1 -2. 8.
## # ... with 336,766 more rows
# The upper-case pattern still matches the lower-case column names.
flights %>%
  select(contains("TIME"))
## # A tibble: 336,776 x 6
## dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <dbl>
## 1 517 515 830 819 227.
## 2 533 529 850 830 227.
## 3 542 540 923 850 160.
## 4 544 545 1004 1022 183.
## 5 554 600 812 837 116.
## 6 554 558 740 728 150.
## 7 555 600 913 854 158.
## 8 557 600 709 723 53.
## 9 557 600 838 846 140.
## 10 558 600 753 745 138.
## # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>
The select helpers ignore case by default. You can change that default by setting ignore.case = FALSE, as shown below.
# ignore.case expects a logical; pass FALSE itself (not the string 'FALSE')
# to make the match case-sensitive, so "TIME" matches no column.
select(flights, contains("TIME", ignore.case = FALSE))
## # A tibble: 336,776 x 0
dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.
mutate(flights, dep_time_min = ((dep_time %/% 100 * 60) + (dep_time %% 100)) %% 1440,
sched_dep_time_min = ((sched_dep_time %/% 100 * 60) + (sched_dep_time %% 100)) %% 1440)
## # A tibble: 336,776 x 21
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 533 529 4. 850
## 3 2013 1 1 542 540 2. 923
## 4 2013 1 1 544 545 -1. 1004
## 5 2013 1 1 554 600 -6. 812
## 6 2013 1 1 554 558 -4. 740
## 7 2013 1 1 555 600 -5. 913
## 8 2013 1 1 557 600 -3. 709
## 9 2013 1 1 557 600 -3. 838
## 10 2013 1 1 558 600 -2. 753
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, dep_time_min <dbl>,
## # sched_dep_time_min <dbl>
(Need to have %% 1440 at the end or otherwise midnight (2400) will be 24 * 60 = 1440, rather than 0)
air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?
f1 <- mutate(flights, diff = arr_time - dep_time)
# Show the recorded air time next to the raw arr_time - dep_time difference.
f1 %>%
  select(air_time, diff)
## # A tibble: 336,776 x 2
## air_time diff
## <dbl> <int>
## 1 227. 313
## 2 227. 317
## 3 160. 381
## 4 183. 460
## 5 116. 258
## 6 150. 186
## 7 158. 358
## 8 53. 152
## 9 140. 281
## 10 138. 195
## # ... with 336,766 more rows
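I would expect air_time and arr_time - dep_time to be the same, but that is not the case. This is because while air_time is in minutes, arr_time and dep_time are clock times in HHMM or HMM format, so their difference is not a number of minutes (for example, 830 - 517 = 313, whereas the elapsed time from 5:17 to 8:30 is 193 minutes). To fix the problem, arr_time and dep_time first need to be converted into continuous numbers, such as minutes since midnight.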
min_rank().
filter(flights, min_rank(desc(dep_delay)) <= 10)
## # A tibble: 10 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301. 1242
## 2 2013 1 10 1121 1635 1126. 1239
## 3 2013 12 5 756 1700 896. 1058
## 4 2013 3 17 2321 810 911. 135
## 5 2013 4 10 1100 1900 960. 1342
## 6 2013 6 15 1432 1935 1137. 1607
## 7 2013 6 27 959 1900 899. 1236
## 8 2013 7 22 845 1600 1005. 1044
## 9 2013 7 22 2257 759 898. 121
## 10 2013 9 20 1139 1845 1014. 1457
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
min_rank() assigns ties to the lowest rank.
1:3 + 1:10 return? Why?
1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter
## object length
## [1] 2 4 6 5 7 9 8 10 12 11
Since the length of the longer vector is not a multiple of the length of the shorter vector, the entirety of the shorter vector is added to the first three elements of the longer vector, the fourth through sixth elements of the longer vector, and the seventh through ninth elements of the longer vector, but then only the first element of the shorter vector is added to the last element of the longer vector.
Cosine, sine, tangent, arc-cosine, arc-sine, arc-tangent, and the two-argument arc-tangent.
# For every flight number, the share of (non-missing) arrivals that were
# exactly 15 or 30 minutes early, exactly 10, 15, 30 or 120 minutes late,
# or exactly on time.
delay_chars <- summarise(
  group_by(flights, flight),
  fifteen_min_early = mean(arr_delay == -15, na.rm = TRUE),
  fifteen_min_late  = mean(arr_delay == 15, na.rm = TRUE),
  ten_min_late      = mean(arr_delay == 10, na.rm = TRUE),
  thirty_min_early  = mean(arr_delay == -30, na.rm = TRUE),
  thirty_min_late   = mean(arr_delay == 30, na.rm = TRUE),
  on_time           = mean(arr_delay == 0, na.rm = TRUE),
  two_hr_late       = mean(arr_delay == 120, na.rm = TRUE)
)
# Print the summary table.
delay_chars
## # A tibble: 3,844 x 8
## flight fifteen_min_early fifteen_min_late ten_min_late thirty_min_early
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.0215 0.0100 0.00574 0.00574
## 2 2 0.0392 0.0196 0. 0.
## 3 3 0.00955 0.00637 0.0159 0.0159
## 4 4 0.0358 0.0102 0.00767 0.0128
## 5 5 0.0123 0.00617 0.00926 0.0216
## 6 6 0.0291 0.00485 0.00485 0.0291
## 7 7 0.0169 0.00424 0. 0.00847
## 8 8 0.0556 0.00855 0.0214 0.
## 9 9 0.0132 0.0132 0.0197 0.
## 10 10 0.0164 0.0164 0.0328 0.
## # ... with 3,834 more rows, and 3 more variables: thirty_min_late <dbl>,
## # on_time <dbl>, two_hr_late <dbl>
# Flight numbers that are 15 minutes early half the time and 15 minutes late
# the other half.
delay_chars %>%
  filter(fifteen_min_early == 0.5 & fifteen_min_late == 0.5)
## # A tibble: 0 x 8
## # ... with 8 variables: flight <int>, fifteen_min_early <dbl>,
## # fifteen_min_late <dbl>, ten_min_late <dbl>, thirty_min_early <dbl>,
## # thirty_min_late <dbl>, on_time <dbl>, two_hr_late <dbl>
# Flight numbers whose every recorded arrival was exactly 10 minutes late.
delay_chars %>%
  filter(ten_min_late == 1)
## # A tibble: 5 x 8
## flight fifteen_min_early fifteen_min_late ten_min_late thirty_min_early
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 2254 0. 0. 1. 0.
## 2 3656 0. 0. 1. 0.
## 3 3785 0. 0. 1. 0.
## 4 3880 0. 0. 1. 0.
## 5 5854 0. 0. 1. 0.
## # ... with 3 more variables: thirty_min_late <dbl>, on_time <dbl>,
## # two_hr_late <dbl>
# Flight numbers that are 30 minutes early half the time and 30 minutes late
# the other half.
delay_chars %>%
  filter(thirty_min_early == 0.5 & thirty_min_late == 0.5)
## # A tibble: 0 x 8
## # ... with 8 variables: flight <int>, fifteen_min_early <dbl>,
## # fifteen_min_late <dbl>, ten_min_late <dbl>, thirty_min_early <dbl>,
## # thirty_min_late <dbl>, on_time <dbl>, two_hr_late <dbl>
# The proportions are computed doubles, and 0.99 / 0.01 have no exact binary
# representation, so compare within a tolerance using near() instead of ==.
filter(delay_chars, near(on_time, 0.99), near(two_hr_late, 0.01))
## # A tibble: 0 x 8
## # ... with 8 variables: flight <int>, fifteen_min_early <dbl>,
## # fifteen_min_late <dbl>, ten_min_late <dbl>, thirty_min_early <dbl>,
## # thirty_min_late <dbl>, on_time <dbl>, two_hr_late <dbl>
Arrival delay is more important, since it can directly affect connecting flights (with departure delay, there is a chance that time can be made up in the air).
not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()).
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
# Number of non-cancelled flights per destination.
count(not_cancelled, dest)
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
# Total distance per tail number: count() sums the wt column instead of
# counting rows.
count(not_cancelled, tailnum, wt = distance)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418.
## 2 N0EGMQ 239143.
## 3 N10156 109664.
## 4 N102UW 25722.
## 5 N103US 24619.
## 6 N104UW 24616.
## 7 N10575 139903.
## 8 N105UW 23618.
## 9 N107US 21677.
## 10 N108UW 32070.
## # ... with 4,027 more rows
Other approach:
# Row count per destination, without count().
summarise(
  group_by(not_cancelled, dest),
  n = n()
)
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
# Summed distance per tail number, without count().
summarise(
  group_by(not_cancelled, tailnum),
  wt = sum(distance)
)
## # A tibble: 4,037 x 2
## tailnum wt
## <chr> <dbl>
## 1 D942DN 3418.
## 2 N0EGMQ 239143.
## 3 N10156 109664.
## 4 N102UW 25722.
## 5 N103US 24619.
## 6 N104UW 24616.
## 7 N10575 139903.
## 8 N105UW 23618.
## 9 N107US 21677.
## 10 N108UW 32070.
## # ... with 4,027 more rows
is.na(dep_delay) | is.na(arr_delay)) is slightly suboptimal. Why? Which is the most important column?
A flight cannot arrive without departing, so dep_delay appears to be the most important column, and we can define cancelled flights by is.na(dep_delay).
flights %>% group_by(carrier, dest) %>% summarise(n()))
flights %>%
group_by(carrier) %>%
summarize(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
arrange(desc(avg_delay))
## # A tibble: 16 x 2
## carrier avg_delay
## <chr> <dbl>
## 1 F9 20.2
## 2 EV 20.0
## 3 YV 19.0
## 4 FL 18.7
## 5 WN 17.7
## 6 9E 16.7
## 7 B6 13.0
## 8 VX 12.9
## 9 OO 12.6
## 10 UA 12.1
## 11 MQ 10.6
## 12 DL 9.26
## 13 AA 8.59
## 14 AS 5.80
## 15 HA 4.90
## 16 US 3.78
Carrier F9 has the worst delays.
Challenge:
# Step 1: mean departure delay for every carrier/destination pair.
# Step 2: per carrier, the median absolute deviation of those means.
# Step 3: most variable carriers first.
flights %>%
  group_by(carrier, dest) %>%
  summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  group_by(carrier) %>%
  summarise(carrier_spread = mad(avg_delay, na.rm = TRUE)) %>%
  arrange(desc(carrier_spread))
## # A tibble: 16 x 2
## carrier carrier_spread
## <chr> <dbl>
## 1 OO 11.0
## 2 AA 7.61
## 3 9E 7.27
## 4 VX 7.24
## 5 UA 5.25
## 6 YV 5.23
## 7 WN 4.65
## 8 DL 4.19
## 9 EV 4.18
## 10 FL 3.52
## 11 B6 3.36
## 12 MQ 3.21
## 13 US 1.54
## 14 AS 0.
## 15 F9 0.
## 16 HA 0.
By grouping by carrier and calculating the median absolute deviation of the mean departure delay for each carrier/destination pair, we can see which carriers experience a large amount of variation in the mean departure delays across destinations (i.e. which carriers have a high carrier_spread). Since these carriers are not consistently bad across destinations, this would indicate that bad airports are likely to blame.
sort argument to count() do. When might you use it?
If TRUE, the sort argument to count() will sort the output in descending order of n. You could use it instead of calling count() and then arrange().
When you combine functions like mean(), sum(), median(), etc… with grouping, the function will be applied to each group rather than to the whole dataset.
tailnum) has the worst on-time record?
flights %>%
group_by(tailnum) %>%
summarize(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
arrange(desc(avg_delay))
## # A tibble: 4,044 x 2
## tailnum avg_delay
## <chr> <dbl>
## 1 N844MH 297.
## 2 N922EV 274.
## 3 N587NW 272.
## 4 N911DA 268.
## 5 N851NW 233.
## 6 N654UA 227.
## 7 N928DN 203.
## 8 N7715E 186.
## 9 N665MQ 177.
## 10 N136DL 165.
## # ... with 4,034 more rows
Plane N844MH has the worst on-time record.
# Mean departure delay for each scheduled hour, smallest first.
arrange(
  summarise(
    group_by(flights, hour),
    avg_delay = mean(dep_delay, na.rm = TRUE)
  ),
  avg_delay
)
## # A tibble: 20 x 2
## hour avg_delay
## <dbl> <dbl>
## 1 5. 0.688
## 2 6. 1.64
## 3 7. 1.91
## 4 8. 4.13
## 5 9. 4.58
## 6 10. 6.50
## 7 11. 7.19
## 8 12. 8.61
## 9 13. 11.4
## 10 14. 13.8
## 11 23. 14.0
## 12 15. 16.9
## 13 16. 18.8
## 14 22. 18.8
## 15 17. 21.1
## 16 18. 21.1
## 17 21. 24.2
## 18 20. 24.3
## 19 19. 24.8
## 20 1. NaN
You should fly at 5 am if you want to avoid delays as much as possible.
# Sum of arrival delays per destination, ignoring missing values.
summarise(
  group_by(flights, dest),
  total_delay_dest = sum(arr_delay, na.rm = TRUE)
)
## # A tibble: 105 x 2
## dest total_delay_dest
## <chr> <dbl>
## 1 ABQ 1113.
## 2 ACK 1281.
## 3 ALB 6018.
## 4 ANC -20.
## 5 ATL 190260.
## 6 AUS 14514.
## 7 AVL 2089.
## 8 BDL 2904.
## 9 BGR 2874.
## 10 BHM 4540.
## # ... with 95 more rows
# For every flight, its arrival delay as a share of the total arrival delay
# of its destination (the grouped sum is repeated on each row of the group).
flights %>%
  group_by(dest) %>%
  mutate(total_delay_dest = sum(arr_delay, na.rm = TRUE)) %>%
  mutate(prop_total_delay = arr_delay / total_delay_dest) %>%
  select(dest, arr_delay, total_delay_dest, prop_total_delay)
## # A tibble: 336,776 x 4
## # Groups: dest [105]
## dest arr_delay total_delay_dest prop_total_delay
## <chr> <dbl> <dbl> <dbl>
## 1 IAH 11. 30046. 0.000366
## 2 IAH 20. 30046. 0.000666
## 3 MIA 33. 3467. 0.00952
## 4 BQN -18. 7322. -0.00246
## 5 ATL -25. 190260. -0.000131
## 6 ORD 12. 97352. 0.000123
## 7 FLL 19. 96153. 0.000198
## 8 IAD -14. 74631. -0.000188
## 9 MCO -8. 76185. -0.000105
## 10 ORD 8. 97352. 0.0000822
## # ... with 336,766 more rows
# Group by destination, then list the flights by ascending air time.
arrange(
  group_by(flights, dest),
  air_time
)
## # A tibble: 336,776 x 19
## # Groups: dest [105]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 16 1355 1315 40. 1442
## 2 2013 4 13 537 527 10. 622
## 3 2013 12 6 922 851 31. 1021
## 4 2013 2 3 2153 2129 24. 2247
## 5 2013 2 5 1303 1315 -12. 1342
## 6 2013 2 12 2123 2130 -7. 2211
## 7 2013 3 2 1450 1500 -10. 1547
## 8 2013 3 8 2026 1935 51. 2131
## 9 2013 3 18 1456 1329 87. 1533
## 10 2013 3 19 2226 2145 41. 2305
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# For every flight, how much longer its air time was than the shortest air
# time recorded for the same destination; largest excess first.
# na.rm = TRUE belongs to min() only: passed to mutate() it would create a
# spurious column named na.rm instead of affecting the computation.
flights %>%
  group_by(dest) %>%
  mutate(rel_air_time = air_time - min(air_time, na.rm = TRUE)) %>%
  select(dest, air_time, rel_air_time, tailnum) %>%
  arrange(desc(rel_air_time))
## # A tibble: 336,776 x 4
## # Groups: dest [105]
## dest air_time rel_air_time tailnum
## <chr> <dbl> <dbl> <chr>
## 1 SFO 490. 195. N703TW
## 2 LAX 440. 165. N178DN
## 3 EGE 382. 163. N5DBAA
## 4 DEN 331. 149. N578UA
## 5 LAX 422. 147. N192DN
## 6 LAS 399. 143. N852UA
## 7 SFO 438. 143. N727TW
## 8 SAN 413. 134. N794JB
## 9 HNL 695. 133. N77066
## 10 SFO 426. 131. N319AA
## # ... with 336,766 more rows
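The plane with tail number N703TW, on a flight to SFO, was the most delayed in the air: its air time was 195 minutes longer than the shortest flight to that destination.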
# Keep destinations served by at least two carriers, then rank the carriers
# by how many of those destinations each one serves.
flights %>%
  group_by(dest) %>%
  filter(n_distinct(carrier) >= 2) %>%
  group_by(carrier) %>%
  summarise(num_dest = n_distinct(dest)) %>%
  arrange(desc(num_dest))
## # A tibble: 16 x 2
## carrier num_dest
## <chr> <int>
## 1 EV 51
## 2 9E 48
## 3 UA 42
## 4 DL 39
## 5 B6 35
## 6 AA 19
## 7 MQ 19
## 8 WN 10
## 9 OO 5
## 10 US 5
## 11 VX 4
## 12 YV 3
## 13 FL 2
## 14 AS 1
## 15 F9 1
## 16 HA 1
# Per tail number, count the flights (in row order) that precede the first
# departure delay of more than 60 minutes: the running total of long delays
# is still zero on exactly those rows.
flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(tailnum) %>%
  mutate(delay_long = dep_delay > 60) %>%
  mutate(delays_before = cumsum(delay_long)) %>%
  filter(delays_before < 1) %>%
  count(sort = TRUE)
## # A tibble: 3,817 x 2
## # Groups: tailnum [3,817]
## tailnum n
## <chr> <int>
## 1 N952UW 215
## 2 N315NB 161
## 3 N705TW 160
## 4 N706TW 149
## 5 N961UW 139
## 6 N713TW 128
## 7 N346NB 127
## 8 N765US 122
## 9 N721TW 120
## 10 N5FAAA 117
## # ... with 3,807 more rows
x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.
ggplot(data = diamonds) +
geom_histogram(mapping = aes(x), binwidth = 0.5)
# Histogram of the y dimension in bins of width 0.5.
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)
# Histogram of the z dimension in bins of width 0.5.
ggplot(diamonds, aes(x = z)) +
  geom_histogram(binwidth = 0.5)
The distribution of the x variable is between 3.5 and 9 for the most part, while the distribution of the y variable is generally between 5 and 10, and the distribution of the z variable is generally between 2.5 and 5 (although there are a few outliers in the y and z variables). The x and y variables are likely length and width since they have similar distribution domains, while the ‘z’ variable is likely depth.
price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)
Default binwidth:
# Price histogram with the default binning (30 bins).
ggplot(diamonds, aes(x = price)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Binwidth = 50:
# Price histogram in bins of width 50.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 50)
Binwidth = 100:
# Price histogram in bins of width 100.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100)
Zoom in on binwidth = 100 plot:
# Same histogram restricted to prices from 0 to 2500; xlim() discards the
# rows outside the limits, which triggers the warning shown below.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100) +
  xlim(c(0, 2500))
## Warning: Removed 26398 rows containing non-finite values (stat_bin).
Surprisingly, there are relatively few diamonds priced around $1500.
# Number of diamonds recorded as 0.99 carat.
diamonds %>%
  filter(carat == 0.99) %>%
  count()
## # A tibble: 1 x 1
## n
## <int>
## 1 23
# Number of diamonds recorded as exactly 1 carat.
diamonds %>%
  filter(carat == 1) %>%
  count()
## # A tibble: 1 x 1
## n
## <int>
## 1 1558
# Carat histogram in bins of 0.01, zoomed to the range around 1 carat
# without discarding any data.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.01) +
  coord_cartesian(xlim = c(0.98, 1.01))
There are 23 0.99 carat diamonds and 1558 1 carat diamonds. This is likely because even though 0.99 and 1 are very close, a 1 carat diamond seems more valuable/desirable since it is a full carat rather than a fraction of a carat.
coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?
ggplot(data = diamonds) +
geom_histogram(mapping = aes(price))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
coord_cartesian():
ggplot(data = diamonds) +
geom_histogram(mapping = aes(price)) +
coord_cartesian(ylim = c(0, 4000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ylim():
ggplot(data = diamonds) +
geom_histogram(mapping = aes(price)) +
ylim(0, 4000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (geom_bar).
coord_cartesian() draws the histogram first and then zooms in on the specified area, while xlim() and ylim() draw the histogram after dropping any values outside of the specified limits.
ggplot(flights, aes(dep_delay)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8255 rows containing non-finite values (stat_bin).
# Recode one carrier ("UA") as missing so the bar chart shows how geom_bar()
# handles NA on a categorical axis.
flights %>%
  mutate(carrier = na_if(carrier, "UA")) %>%
  ggplot(aes(carrier)) +
  geom_bar()
Missing values in a histogram are removed before drawing the histogram/calculating counts for each bin. Missing values in a bar chart are placed in a separate NA category. There is a difference because histograms are used to visualize continuous variables, while bar charts are used to visualize categorical variables. In a histogram, the variable on the x axis needs to have a numeric value. Since NA values do not have a numeric value, they cannot be placed in any bin and need to be removed. In a bar chart, on the other hand, the value on the x axis does not need to be numeric - the NA values can simply be counted together as another category.
What does na.rm = TRUE do in mean() and sum()?
It removes the missing values before computing mean() and sum().
# Compare the scheduled departure time of cancelled vs. non-cancelled flights.
flights %>%
  mutate(
    # A flight with no recorded departure time is treated as cancelled.
    cancelled = is.na(dep_time),
    # Convert the scheduled time to minutes: hundreds part * 60 plus the
    # remainder (assumes sched_dep_time is encoded as HHMM -- TODO confirm).
    # The final %% 1440 wraps a full day's worth of minutes back to 0.
    sched_dep_time_min =
      (60 * (sched_dep_time %/% 100) + sched_dep_time %% 100) %% 1440
  ) %>%
  ggplot(mapping = aes(x = sched_dep_time_min, y = ..density..)) +
  # Plot density rather than count so the two groups are comparable.
  geom_freqpoly(mapping = aes(color = cancelled), binwidth = 25)
coord_flip()?
library(ggstance)
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
ggplot(data = diamonds, mapping = aes(carat, cut)) +
geom_boxploth()
To get the same plot with coord_flip:
ggplot(data = diamonds, mapping = aes(cut, carat)) +
geom_boxplot() +
coord_flip()
geom_lv() to display the distribution of price vs cut. What do you learn? How do you interpret the plots?
With boxplot:
ggplot(diamonds, aes(cut, price)) +
geom_boxplot()
With lvplot:
library(lvplot)
ggplot(diamonds, aes(cut, price)) +
geom_lv()
## Error: GeomLv was built with an incompatible version of ggproto.
## Please reinstall the package that provides this extension.
There are fewer outliers in the letter value plot than in the boxplot. The letter value plot also shows more quantiles than the boxplot (i.e. beyond the quartiles), which is useful for large datasets.
Compare and contrast geom_violin() with a facetted geom_histogram(), or a coloured geom_freqpoly(). What are the pros and cons of each method?
geom_violin():
ggplot(diamonds, aes(cut, price)) +
geom_violin()
Faceted geom_histogram():
ggplot(diamonds, aes(price)) +
geom_histogram() +
facet_grid(. ~ cut)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Colored geom_freqpoly():
ggplot(diamonds, aes(price, color = cut)) +
geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
geom_violin() and faceted geom_histogram() are useful for visualizing individual distributions, while colored geom_freqpoly() is useful for comparing distributions.
geom_jitter() to see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does.
geom_quasirandom() and geom_beeswarm() both are used to reduce overplotting by plotting points that would ordinarily overlap next to each other. geom_quasirandom() spaces the points based on either a van der Corput sequence or Tukey texturing; geom_beeswarm() uses a point-size based offset.
Distribution of cut within color:
diamonds %>%
count(color, cut) %>%
group_by(color) %>%
mutate(prop = n / sum(n)) %>%
ggplot(mapping = aes(color, cut)) +
geom_tile(mapping = aes(fill = prop))
Distribution of color within cut:
diamonds %>%
count(color, cut) %>%
group_by(cut) %>%
mutate(prop = n / sum(n)) %>%
ggplot(mapping = aes(color, cut)) +
geom_tile(mapping = aes(fill = prop))
Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?
flights %>%
group_by(dest, month) %>%
summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
ggplot(mapping = aes(month, dest)) +
geom_tile(mapping = aes(fill = avg_dep_delay))
The plot is difficult to read since the destinations are unordered, but this could be improved by sorting destinations by the number of months for which they have data, as shown below. It is also hard to differentiate between the shades of blue, so a different color scale would also help to make the plot easier to read.
# Heat map of mean departure delay by month and destination, with the
# destinations ordered by how many months they appear in.
flights %>%
  group_by(month, dest) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  # group_by() replaces whatever grouping summarize() left behind, so no
  # explicit ungroup() is needed; n() then counts the rows (one per month)
  # for each destination.
  group_by(dest) %>%
  mutate(num_month = n()) %>%
  ggplot(mapping = aes(x = factor(month), y = reorder(dest, num_month))) +
  geom_tile(mapping = aes(fill = avg_dep_delay)) +
  # Diverging palette instead of the default single-hue gradient.
  scale_fill_distiller(type = "div", palette = "Spectral")
Why is it slightly better to use aes(x = color, y = cut) rather than aes(x = cut, y = color) in the example above?
Using aes(x = color, y = cut):
diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = n))
Using aes(x = cut, y = color):
diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(x = cut, y = color)) +
geom_tile(mapping = aes(fill = n))
It is better to use aes(x = color, y = cut) as this will produce approximately square tiles that are easier to compare to each other (compared to the rectangles that aes(x = cut, y = color) will produce).
cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price?
diamonds %>%
ggplot() +
geom_freqpoly(mapping = aes(x = price, color = cut_width(carat, .25)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
diamonds %>%
ggplot() +
geom_freqpoly(mapping = aes(x = price, color = cut_number(carat, 10)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Since cut_width() and cut_number() split a variable into groups, the number or width of bins chosen needs to be large enough to be able to visualize changes in distribution between groups but not too large.
library(ggstance)
ggplot(data = diamonds, mapping = aes(x = price, y = cut_number(carat, 10))) +
geom_boxploth()
The price distribution for larger diamonds is more spread out. This is expected, as other factors such as color, cut, etc… may alter the pricing of large diamonds (these factors are less noticeable with smaller diamonds).
diamonds %>%
ggplot() +
geom_boxplot(mapping = aes(x = cut, y = price, color = cut_number(carat, 5)))
# Heat map of mean price for every combination of cut and carat group.
diamonds %>%
  # Split carat into 10 groups with cut_number().
  mutate(carat_group = cut_number(carat, 10)) %>%
  group_by(cut, carat_group) %>%
  summarize(avg_price = mean(price)) %>%
  # Aesthetics are declared once on the plot and inherited by the tile layer.
  ggplot(mapping = aes(x = cut, y = carat_group, fill = avg_price)) +
  geom_tile() +
  scale_fill_distiller(type = "div", palette = "Spectral")
x and y values, which makes the points outliers even though their x and y values appear normal when examined separately. Why is a scatterplot a better display than a binned plot for this case?
Scatterplot:
ggplot(data = diamonds) +
geom_point(mapping = aes(x = x, y = y)) +
coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
Binned plot:
ggplot(data = diamonds) +
geom_bin2d(mapping = aes(x = x, y = y), bins = 100) +
coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
In the scatterplot, we can see the outliers as individual points, rather than as binned counts, making the scatterplot a better display than a binned plot for this case.